package com.digitalpebble.azazello.tika;

import java.io.ByteArrayInputStream;
import java.io.InputStream;

import org.apache.spark.api.java.function.Function;
import org.apache.tika.Tika;

import com.digitalpebble.azazello.Document;

/**
 * Spark function that enriches a {@link Document} with the plain text that
 * Tika extracts from the document's binary content.
 */
public class TikaFunction implements Function<Document, Document> {

    /**
     * Tika facade used for every call. It is held in a static field, so it is
     * created once per class load and is not part of an instance's state.
     */
    private static final Tika TIKA = new Tika();

    /**
     * Parses the binary content of {@code doc} with Tika and stores the
     * extracted text on the same document via {@code setText}.
     *
     * @param doc the document to enrich; its binary content is read and its
     *            text is overwritten
     * @return the same {@code doc} instance, with its text set
     * @throws Exception if Tika fails to read or parse the content
     */
    @Override
    public Document call(Document doc) throws Exception {
        // NOTE(review): an earlier version also ran
        // tika.detect(doc.getBinaryContent(), doc.getUri()) but discarded the
        // result; that unused call has been dropped. If the detected MIME type
        // is meant to be stored on the document, reinstate it here together
        // with the appropriate setter.

        // try-with-resources guarantees the stream is closed even when
        // parsing throws.
        try (InputStream content = new ByteArrayInputStream(doc.getBinaryContent())) {
            doc.setText(TIKA.parseToString(content));
        }
        return doc;
    }
}